"""Exploratory analysis of the 'Latest Covid-19 India Status' dataset.

Loads the per-state CSV, prints basic diagnostics, derives an aggregate
"All India" row, and renders a series of seaborn / plotly charts.

This file was exported from a notebook.  Bare expressions that a notebook
would display implicitly are wrapped in ``print`` / ``.show()`` here so the
output is also produced when the file is run as a plain script.
"""
import os

import matplotlib as mpt
import matplotlib.pyplot as plt  # explicit: `import matplotlib` alone does not load pyplot
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

# The four absolute-count columns plotted together throughout the analysis.
COUNT_COLUMNS = ["Total Cases", "Active", "Discharged", "Deaths"]

# State boundaries used by every choropleth; `properties.ST_NM` in this file
# is matched against the 'State/UTs' column.
INDIA_STATES_GEOJSON = (
    "https://gist.githubusercontent.com/jbrobst/56c13bbbf9d97d187fea01ca62ea5112"
    "/raw/e388c4cae20aa53cb5090210a42ebb9b765c0a36/india_states.geojson"
)

# --------------------------------------------------------------------------
# Load and inspect
# --------------------------------------------------------------------------
data = pd.read_csv('Latest Covid-19 India Status.csv')
df = data.copy()  # work on a copy so the raw data stays untouched

print(df.head())              # first few rows
print(df.shape)               # (rows, columns)
print(df.columns)             # the available features
df.info()                     # dtypes and null counts (prints by itself)
print(df.duplicated().sum())  # number of fully duplicated rows
print(df.describe())          # summary statistics

# Correlate numeric columns only: 'State/UTs' holds strings, and recent
# pandas versions raise instead of silently dropping non-numeric columns.
# A correlation above 0.7 is treated as significant in this analysis.
correlations = df.select_dtypes(include="number").corr()
print(correlations)

plt.figure(figsize=(9, 6))
sns.heatmap(correlations, annot=True)
plt.show()

# Original author's reading of the heatmap: total cases correlates strongly
# with deaths, active and discharged counts.
fig = px.scatter_matrix(df, dimensions=COUNT_COLUMNS, color="State/UTs")
fig.update_layout(width=1200, height=1000)
fig.show()

# --------------------------------------------------------------------------
# Rank states and build the aggregate "All India" row
# --------------------------------------------------------------------------
# States/UTs ordered from most to fewest total cases, with a fresh 0..n-1 index.
df_sorted = df.copy().sort_values('Total Cases', ascending=False).reset_index(drop=True)
print(df_sorted)

# Sum over every row actually present rather than a hard-coded row count.
totalCases = int(df_sorted['Total Cases'].sum())
active = int(df_sorted['Active'].sum())
discharged = int(df_sorted['Discharged'].sum())
deaths = int(df_sorted['Deaths'].sum())
population = int(df_sorted['Population'].sum())

# Nationwide ratios, as percentages of total cases.
activeRatio = round((100 * active) / totalCases, 2)
dischargeRatio = round((100 * discharged) / totalCases, 2)
deathRatio = round((100 * deaths) / totalCases, 2)

# The row is assigned positionally, so the order here must match the
# DataFrame's column order.
allIndia = [
    'All India',     # State/UTs
    totalCases,      # Total Cases
    active,          # Active
    discharged,      # Discharged
    deaths,          # Deaths
    activeRatio,     # Active Ratio
    dischargeRatio,  # Discharge Ratio
    deathRatio,      # Death Ratio
    population,      # Population
]

df_allIndia = df_sorted.copy()
df_allIndia.loc[len(df_allIndia)] = allIndia
df_allIndia = df_allIndia.sort_values('Total Cases', ascending=False).reset_index(drop=True)
print(df_allIndia)

# Share of each state's (and the country's) population that has been infected.
df_allIndia['(%) of Affected People'] = round(
    (df_allIndia['Total Cases'] * 100) / df_allIndia['Population'], 2
)
print(df_allIndia)

# --------------------------------------------------------------------------
# Pie charts
# --------------------------------------------------------------------------
# Exclude the aggregate row so the pie only contains real states/UTs.
df_allIndia2 = df_allIndia[df_allIndia['State/UTs'] != 'All India']
fig = px.pie(
    df_allIndia2,
    values='(%) of Affected People',
    names='State/UTs',
    title="(%) of people affected in each state with respect to the total population of the state",
)
fig.show()

fig = px.pie(
    df_sorted,
    values='Total Cases',
    names='State/UTs',
    title="Covid cases all over India in (%)",
)
fig.show()

# --------------------------------------------------------------------------
# Bar charts
# --------------------------------------------------------------------------
# One bar group per state for the ten states with the most cases
# (head() tolerates datasets with fewer than ten rows).
fig = go.Figure()
for _, state_row in df_sorted.head(10).iterrows():
    fig.add_trace(
        go.Bar(
            name=state_row['State/UTs'],
            x=COUNT_COLUMNS,
            y=state_row[COUNT_COLUMNS],
        )
    )
fig.update_layout(title={"text": 'Top 10 affected states'})
fig.show()

# Total cases per state, coloured by each ratio in turn.
# Original author's observations: death ratio is highest in Punjab,
# discharge ratio is highest in Arunachal Pradesh, and Mizoram has the
# highest active ratio.
for ratio_column, chart_width, chart_height in (
    ("Death Ratio", 900, 600),
    ("Discharge Ratio", 700, 900),
    ("Active Ratio", 900, 600),
):
    fig = px.bar(
        df_sorted,
        x="Total Cases",
        y="State/UTs",
        color=ratio_column,
        width=chart_width,
        height=chart_height,
    )
    fig.show()

# --------------------------------------------------------------------------
# Choropleth maps
# --------------------------------------------------------------------------
for value_column, colour_scale in (
    ('Active', 'Greens'),
    ('Deaths', 'Reds'),
    ('Total Cases', 'Blues'),
    ('Discharged', 'Purples'),
):
    fig = px.choropleth(
        df,
        geojson=INDIA_STATES_GEOJSON,
        featureidkey='properties.ST_NM',
        locations='State/UTs',
        color=value_column,
        color_continuous_scale=colour_scale,
    )
    # Zoom to the plotted states and hide the base world map.
    fig.update_geos(fitbounds="locations", visible=False)
    fig.show()

# --------------------------------------------------------------------------
# Distribution of total cases
# --------------------------------------------------------------------------
fig = px.histogram(
    df_sorted,
    'Total Cases',
    color="State/UTs",
    title="<b>Average cases per state</b>",
    width=600,
    height=400,
)
# Dashed vertical line marks the mean across states.
fig.add_vline(
    x=df_sorted['Total Cases'].mean(),
    line_width=2,
    line_dash="dash",
    line_color="black",
)
fig.show()
# Distribution of active cases across states; the dashed vertical line marks
# the mean.
fig = px.histogram(
    df_sorted,
    'Active',
    color="State/UTs",
    title="<b>Average active cases per state</b>",
    width=600,
    height=400,
)
fig.add_vline(
    x=df_sorted['Active'].mean(),
    line_width=2,
    line_dash="dash",
    line_color="black",
)
fig.show()

# Total vs. active cases, bubble size = deaths.
# Colour by the column *name* so the state label comes from the same
# (sorted) frame as the x/y/size values.  Passing a Series taken from the
# unsorted frame is matched to rows by position and mislabels the states.
fig = px.scatter(
    df_sorted,
    x="Total Cases",
    y="Active",
    size="Deaths",
    color="State/UTs",
    log_x=True,
    size_max=50,
)
fig.show()

# Same axes, with both bubble size and colour driven by the active ratio;
# the state name appears on hover.
fig = px.scatter(
    df_sorted,
    x="Total Cases",
    y="Active",
    size="Active Ratio",
    color="Active Ratio",
    hover_name="State/UTs",
    log_x=True,
    size_max=70,
    width=600,
    height=400,
)
fig.show()  # explicit show: a bare expression renders only inside a notebook